Requirements

NOTE: This analysis requires at least 10Gb of RAM to run.

Parameters

input_folder <- "raw_input"
output_folder <- "results"
output_format <- ".pdf"
matrix_plot_depth <- 6
color_range <- c(-4, 4)

Requirements

NOTE: This analysis requires at least 10Gb of RAM to run.

Parsing taxonomic data

The function parse_hmp_qiime is a wrapper for extract_taxonomy made for this analysis. It might be made more generic in the future. The information on sample characteristics (e.g. body site) is saved in a table called mapping in the taxmap object.

library(metacoder)
v35_data <- parse_hmp_qiime(otu_file = "http://downloads.hmpdacc.org/data/HMQCP/otu_table_psn_v35.txt.gz",
                            mapping_file = "http://downloads.hmpdacc.org/data/HMQCP/v35_map_uniquebyPSN.txt.bz2")
v35_data
## `taxmap` object with data for 985 taxa and 45383 observations:
## 
## ----------------------------------------- taxa -----------------------------------------
## 1, 2, 3, 4, 5, 6, 7, 8, 9 ... 976, 977, 978, 979, 980, 981, 982, 983, 984, 985
## 
## -------------------------------------- taxon_data --------------------------------------
## # A tibble: 985 x 4
##   taxon_ids supertaxon_ids  rank            name
##       <chr>          <chr> <chr>           <chr>
## 1         1           <NA>                  Root
## 2         2              1     p   Acidobacteria
## 3         3              1     p  Actinobacteria
## 4         4              1     p Armatimonadetes
## 5         5              1     p   Bacteroidetes
## 6         6              1     p          CCM11b
## 7         7              1     p     Chloroflexi
## # ... with 978 more rows
## 
## --------------------------------------- obs_data ---------------------------------------
## # A tibble: 45,383 x 4,790
##   obs_taxon_ids       otu_id 700114607 700114380 700114716 700114798 700114710 700114614 700114755
##           <chr>        <chr>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1           452     OTU_97.1         0         0         0         0         0         0         0
## 2           734    OTU_97.10         0         0         0         0         0         0         0
## 3           243   OTU_97.100         0         0         0         0         0         0         0
## 4           735  OTU_97.1000         0         0         0         0         0         0         0
## 5           351 OTU_97.10000         0         0         0         0         0         0         0
## 6           179 OTU_97.10001         0         0         0         0         0         0         0
## 7           734 OTU_97.10002         0         0         0         0         0         0         0
## # ... with 4.538e+04 more rows, and 4781 more variables: 700114715 <dbl>, 700114706 <dbl>,
## #   700114613 <dbl>, 700114803 <dbl>, 700114714 <dbl>, 700114482 <dbl>, 700114439 <dbl>,
## #   700114658 <dbl>, 700114387 <dbl>, 700114441 <dbl>, 700114608 <dbl>, 700111759 <dbl>,
## #   700114603 <dbl>, 700106988 <dbl>, 700113609 <dbl>, 700114612 <dbl>, 700110821 <dbl>,
## #   700114486 <dbl>, 700114481 <dbl>, 700114377 <dbl>, 700114424 <dbl>, 700114712 <dbl>,
## #   700114713 <dbl>, 700114554 <dbl>, 700114606 <dbl>, 700114335 <dbl>, 700114707 <dbl>,
## #   700114442 <dbl>, 700114610 <dbl>, 700114437 <dbl>, 700114327 <dbl>, 700114328 <dbl>,
## #   700114271 <dbl>, 700114223 <dbl>, 700114012 <dbl>, 700114117 <dbl>, 700111749 <dbl>,
## #   700113560 <dbl>, 700114709 <dbl>, 700108534 <dbl>, 700113020 <dbl>, 700114005 <dbl>,
## #   700114212 <dbl>, 700114274 <dbl>, 700114162 <dbl>, 700108165 <dbl>, 700113059 <dbl>,
## #   700114330 <dbl>, 700111035 <dbl>, 700110827 <dbl>, 700114051 <dbl>, 700105883 <dbl>,
## #   700114282 <dbl>, 700114279 <dbl>, 700113019 <dbl>, 700114750 <dbl>, 700114386 <dbl>,
## #   700113502 <dbl>, 700109121 <dbl>, 700111515 <dbl>, 700110308 <dbl>, 700114113 <dbl>,
## #   700113060 <dbl>, 700111520 <dbl>, 700114431 <dbl>, 700114006 <dbl>, 700111243 <dbl>,
## #   700114105 <dbl>, 700114056 <dbl>, 700110426 <dbl>, 700106490 <dbl>, 700110433 <dbl>,
## #   700114385 <dbl>, 700114438 <dbl>, 700111521 <dbl>, 700113016 <dbl>, 700111518 <dbl>,
## #   700111306 <dbl>, 700114440 <dbl>, 700113652 <dbl>, 700114048 <dbl>, 700111449 <dbl>,
## #   700110831 <dbl>, 700114112 <dbl>, 700111824 <dbl>, 700113017 <dbl>, 700114333 <dbl>,
## #   700114004 <dbl>, 700110654 <dbl>, 700114611 <dbl>, 700114210 <dbl>, 700114339 <dbl>,
## #   700111305 <dbl>, 700114170 <dbl>, 700111163 <dbl>, 700110657 <dbl>, 700110173 <dbl>,
## #   700114711 <dbl>, 700113116 <dbl>, 700111581 <dbl>, ...
## 
## ------------------------------------- taxon_funcs -------------------------------------
## n_obs, n_obs_1, n_supertaxa, n_subtaxa, n_subtaxa_1, hierarchies

Read abundance

I will calculate total read abundance per taxon for later graphing and filtering.

calculate_abundance <- function(data) {
  obs_samples <- colnames(data$obs_data) %in% data$mapping$sample_id
  mutate_taxa(data,
              abundance = vapply(obs(data), 
                                 function(i) sum(data$obs_data[i, obs_samples]), numeric(1)))
}
v35_data <- calculate_abundance(v35_data)

Plot of everything

To get an idea of how the data looks overall I will make a plot showing OTU and read abundance of all the data combined.

plot_all <- function(data, output_name, seed = 1) {
  set.seed(seed)
  data %>%
    filter_taxa(abundance >= 100) %>%
    filter_taxa(name != "") %>% # Some taxonomic levels are not named
    heat_tree(node_size = n_obs,
              node_size_axis_label = "Number of OTUs",
              node_color = abundance,
              node_color_trans = "area",
              node_color_axis_label = "Number of reads",
              node_label = name,
              node_label_max = 100,
              overlap_avoidance = 1,
              output_file = file.path(output_folder, paste0(output_name, output_format)))
}

plot_all(v35_data, "hmp--v35--all_data")

Plot body site differences

The HMP dataset is great for comparing treatment since there are many body sites with many replicates so statistical tests can be used to find real correlation between body sites and taxon abundance. The code below applies the Wilcox rank-sum test to differences in median read proportion between every pair of body sties compared. Since the data is compositional in nature (i.e. not idependent samples) we used a non-parametric test and used median instead of mean read proportion.

calculate_prop <- function(data, site) {
  sample_cols <- as.character(unlist(data$mapping[data$mapping$body_site == site, "sample_id"]))
  sample_cols <- sample_cols[sample_cols %in% colnames(data$obs_data)]
  obs_indexes <- obs(data)
  total_counts <- vapply(sample_cols, function(s) sum(data$obs_data[, s]), numeric(1))
  col_name <- paste0(site, "_median_prop")
  lapply(obs_indexes,
         function(i) {
           vapply(sample_cols, 
                  function(s) sum(data$obs_data[i, s]) / total_counts[s],
                  numeric(1))})
}

calculate_prop_diff <- function(data, site_1, site_2) {
  
  remove_inf <- function(values) {
    values[values == Inf] <- 10000000000000000000000000
    values[values == -Inf] <- -10000000000000000000000000
    values[is.nan(values)] <- 0
    return(values)
  }
  
  props_1 <- calculate_prop(data, site_1)
  props_2 <- calculate_prop(data, site_2)
  p_value <- mapply(function(x, y) wilcox.test(x, y)$p.value,
                    props_1, props_2)
  p_value <- p.adjust(p_value, method = "fdr")
  result <- ifelse(p_value > 0.05 | is.nan(p_value), 0,
                   log2(vapply(props_1, median, numeric(1)) / vapply(props_2, median, numeric(1))))
  remove_inf(result)
}

plot_body_site_diff <- function(data, site_1, site_2, output_name, seed = 1) {
  set.seed(seed)
  data %>%
    mutate_taxa(median_prop_diff = calculate_prop_diff(data, site_1, site_2)) %>%
    filter_taxa(abundance >= 100) %>%
    filter_taxa(name != "") %>% # Some taxonomic levels are not named
    heat_tree(node_size_axis_label = "Number of OTUs",
              node_size = n_obs,
              node_color_axis_label = "Log 2 ratio of median proportions",
              node_color = median_prop_diff,
              node_color_range = diverging_palette(),
              node_color_trans = "linear",
              node_color_interval = color_range,
              edge_color_interval = color_range,
              node_label = name,
              node_label_max = 150,
              output_file = file.path(output_folder, paste0(output_name, "--", site_1, "_vs_", site_2, output_format)))
}
plot_body_site_diff(v35_data, "Throat", "Saliva", "hmp--v35")

plot_body_site_diff(v35_data,  "Supragingival_plaque", "Subgingival_plaque", "hmp--v35")

plot_body_site_diff(v35_data,  "Anterior_nares", "Buccal_mucosa", "hmp--v35")

Conventional bubble plot

Here is an example of a more conventional way of displaying the same data that does not supply taxonomic context.

library(ggplot2)
sites_used = c("Stool", "Saliva", "Tongue_dorsum", "Buccal_mucosa", "Anterior_nares")
plot_data <- v35_data
plot_data$taxon_data[sites_used] <- lapply(sites_used, 
                                          function(x) sapply(calculate_prop(plot_data, x), mean))
plot_data <- filter_taxa(plot_data, rank == "g")$taxon_data
plot_data <- plot_data[rev(order(plot_data$abundance)), ]
plot_data <- plot_data[1:10, c("name", sites_used)]
plot_data$y <- 10:1
plot_data <- tidyr::gather(plot_data, site, prop, -name, -y)
plot_data$x <- rep(seq_along(sites_used), each = 10)
bubble_plot <- ggplot(plot_data, aes(x = x, y = y, size = prop)) + 
  geom_point() + 
  scale_x_continuous(breaks = seq_along(sites_used), labels = sites_used, limits = c(1, 6)) +
  scale_y_continuous(breaks = 1:10, labels = rev(unique(plot_data$name)), limits = c(1, 10)) +
  guides(size = FALSE) +
  ggtitle("Abundant Genera") +
  theme(panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank(),
        axis.text.x = element_text(angle = -20, hjust = 0),
        legend.background = element_blank()) 
ggsave(file.path(output_folder, "bubble_plot.pdf"), width = 5, height = 4)
print(bubble_plot)

Plot body site difference matrix

The graphs above are great for comparing two treatments, but it would be nice to see how many treatments compare in a single graph. To do this, we have developed a pair-wise matrix layout for comparisons of this kind.

# Identify pairs of treatments to compare
body_sites <- c("Stool", "Saliva", "Tongue_dorsum", "Buccal_mucosa", "Anterior_nares")
combinations <- t(combn(seq_along(body_sites), 2))
site_diff_cols <- apply(combinations, MARGIN = 1, function(i) paste0(body_sites[i], collapse = "-"))

layout_matrix <- matrix(rep(NA, (length(body_sites))^2), nrow = length(body_sites))
for (index in 1:nrow(combinations)) {
  layout_matrix[combinations[index,1], combinations[index,2]] <- index
}

# Make reduced taxonomy for plotting
v35_reduced <- v35_data

# Calculate significant body site differneces
not_used <- apply(combinations, MARGIN = 1, function(i) {
  v35_reduced$taxon_data[[paste0(body_sites[i], collapse = "-")]] <<- calculate_prop_diff(v35_reduced, body_sites[i[1]],  body_sites[i[2]])
})

# Remove data not needed for plotting
v35_reduced$obs_data <- v35_reduced$obs_data[, c("obs_taxon_ids", "otu_id")] # remove per-otu data since it is big

# Record taxa with no differences in any treatment
v35_reduced <- mutate_taxa(v35_reduced, 
                           is_diff = apply(v35_reduced$taxon_data[ , site_diff_cols], MARGIN = 1,
                                           function(a_row) any(a_row != 0)))
# Make individual plots
plot_body_site_diff_small <- function(data, diff_col, seed = 1) {
  set.seed(seed)
  data %>%
    mutate_taxa(diff_col = data$taxon_data[[diff_col]]) %>%
    filter_taxa(n_supertaxa <= matrix_plot_depth) %>%
    filter_taxa(abundance > 1000) %>%
    filter_taxa(name != "") %>% # Some taxonomic levels are not named
    heat_tree(node_size = n_obs,
              node_color = diff_col,
              node_color_range = diverging_palette(),
              node_color_trans = "linear",
              node_color_interval = color_range,
              edge_color_interval = color_range,
              overlap_avoidance = 0.7,
              node_size_axis_label = "Number of OTUs",
              node_color_axis_label = "Median proportion difference",
              make_legend = FALSE)
}


sub_plots <- lapply(site_diff_cols, function(x) plot_body_site_diff_small(v35_reduced, x))
key_plot <- v35_reduced %>% 
  filter_taxa(n_supertaxa <= matrix_plot_depth) %>%
  filter_taxa(abundance > 1000) %>%
  filter_taxa(name != "") %>% # Some taxonomic levels are not named
  heat_tree(node_size = n_obs,
            node_color = "#DDDDDD",
            node_color_range = diverging_palette(),
            node_color_trans = "linear",
            node_color_interval = color_range,
            node_label = ifelse(n_obs > 1000, name, NA),
            edge_label = ifelse(n_obs > 1000, NA, name),
            node_label_max = 500,
            edge_label_max = 500,
            edge_label_size_range = c(0.007, 0.02),
            node_label_size_range = c(0.007, 0.02),
            overlap_avoidance = 0.7,
            node_size_axis_label = "Number of OTUs",
            node_color_axis_label = "Log 2 ratio of median proportions",
            make_legend = TRUE)
calc_subplot_coords <- function(a_matrix, x1 = 0, y1 = 0, x2 = 1, y2 = 1) {
  # lowerleft = c(x1, y1), upperright = c(x2, y2)
  x_coords <- seq(from = x1, to = x2, length.out = ncol(a_matrix) + 1)[- (ncol(a_matrix) + 1)]
  y_coords <- seq(from = y1, to = y2, length.out = nrow(a_matrix) + 1)[- (nrow(a_matrix) + 1)]
  do.call(rbind, lapply(1:ncol(a_matrix), function(x) data.frame(plot_index = a_matrix[, x],
                                                                 x = x_coords[x], 
                                                                 y = rev(y_coords))))
}

# remove empty column/row
layout_matrix <- layout_matrix[! apply(layout_matrix, MARGIN = 1, function(x) all(is.na(x))), ]
layout_matrix <- layout_matrix[ ,! apply(layout_matrix, MARGIN = 2, function(x) all(is.na(x)))]

# Get subplot layout data
matrix_data <- calc_subplot_coords(layout_matrix, x1 = 0.2, y1 = 0.2, x2 = 0.95, y2 = 0.95)
matrix_data$site <- site_diff_cols[layout_matrix]
matrix_data <- tidyr::separate(matrix_data, site, c("site_1", "site_2"), "-")
matrix_data <- matrix_data[!is.na(matrix_data$plot_index), ]
matrix_data <- matrix_data[order(matrix_data$plot_index), ]
rownames(matrix_data) <- matrix_data$plot_index

# Make label data
named_row <- which(apply(layout_matrix, MARGIN = 1, function(x) all(!is.na(x))))
named_col <- which(apply(layout_matrix, MARGIN = 2, function(x) all(!is.na(x))))
horz_label_data <- matrix_data[match(layout_matrix[named_row, ], matrix_data$plot_index), ]
vert_label_data <- matrix_data[match(layout_matrix[, named_col], matrix_data$plot_index), ]
subgraph_width <- abs(horz_label_data$x[1] - horz_label_data$x[2])
subgraph_height <- abs(vert_label_data$y[1] - vert_label_data$y[2])
horz_label_data$label_x <- horz_label_data$x + subgraph_width / 2 # center of label
horz_label_data$label_y <- 0.96 # bottom of label
vert_label_data$label_x <- 0.96 # bottom of rotated label 
vert_label_data$label_y <- vert_label_data$y + subgraph_height / 2 # center of rotated label 

# Make plot
library(cowplot)
library(metacoder)
label_size <- 12
matrix_plot <- ggdraw() + 
  draw_plot(key_plot, x = 0, y = -0.03, width = 0.78, height = 0.67) +
  draw_text(gsub("_", " ", horz_label_data$site_2), 
            x = horz_label_data$label_x, y = horz_label_data$label_y, 
            size = label_size, colour = diverging_palette()[1],
            hjust = "center", vjust = "bottom") +
  draw_text(gsub("_", " ", vert_label_data$site_1), 
            x = vert_label_data$label_x, y = vert_label_data$label_y, 
            size = label_size, colour = diverging_palette()[3],
            hjust = "center", vjust = "bottom", angle = -90)
for (i in seq_along(sub_plots)) {
  matrix_plot <- matrix_plot + draw_plot(sub_plots[[i]], 
                                         x = matrix_data[i, "x"],
                                         y = matrix_data[i, "y"],
                                         width = subgraph_width, height = subgraph_height)
}
print(matrix_plot)

save_plot(file.path(output_folder, "figure_3--hmp_matrix_plot.pdf"), 
          matrix_plot, base_width = 7.5, base_height = 7.5)